We use the personal information in this dataset to predict whether or not a patient has a certain unspecified disease.
Attribute information for the dataset (Disease Prediction Training.csv):
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.svm import SVC
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import KFold
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
# Load the training data.
# NOTE(review): `df` is a second, unused copy of the same CSV — everything
# below works on `data`; confirm `df` isn't needed before removing it.
df = pd.read_csv("Disease_Prediction_Training.csv")
data = pd.read_csv("Disease_Prediction_Training.csv")
# Peek at the first rows and the (rows, columns) shape.
data.head()
data.shape
We can see that this dataset has 49,000 records and 12 columns.
# Column dtypes, then the per-column count of missing values.
data.dtypes
data.isnull().sum()
After checking for missing values, we can see there are none in this dataset, but we still need to verify that every value is correct and reasonable.
data.describe()
From data.describe we can see that the mean age is about 53; the youngest person is 29 years old and the oldest is 64. The mean height is 164.4 cm, but the shortest recorded height is 55 cm. According to Wikipedia, the shortest adult ever recorded was 54.6 cm tall, so it is implausible that so many people in this dataset are below normal height.
# Box plots to spot outliers in Height and Weight, then list the
# implausibly short records (< 60 cm) for manual inspection.
fig = px.box(data, y="Height")
fig.show()
data[data.Height<60]
fig = px.box(data, y="Weight")
fig.show()
As with the height issues, it is not possible for a person to weigh only 10 kg. According to Wikipedia, the lightest woman currently alive weighs about 26 kg, so I decided to drop the records with weights below 30 kg.
# Inspect implausible weights (< 30 kg) and the two blood-pressure columns.
data[data.Weight<30]
fig = px.box(data, y="Low Blood Pressure")
fig.show()
# Show the bulk of diastolic readings below 200; anything above would be
# a data-entry error.
data[data["Low Blood Pressure"]<200]
fig = px.box(data, y="High Blood Pressure")
fig.show()
# Systolic readings below 90 look like entry errors (e.g. a missing digit).
data[data["High Blood Pressure"]<90]
data["High Blood Pressure"].unique()
# Four-digit systolic values (1000-2000) look like a reading with an extra
# trailing digit appended.
data[(data["High Blood Pressure"]>1000) &(data["High Blood Pressure"]<2000) ]
data["Low Blood Pressure"].unique()
def blood_pressure(x):
    """Normalize a mis-entered blood-pressure reading.

    Heuristics, based on the value ranges observed in the data:
    - negative value: sign error, take the absolute value
    - 1..29: a missing trailing zero, multiply by 10
    - 301..2000: one extra digit, drop the last digit
    - above 2000: two extra digits, drop the last two digits
    - anything else is assumed to already be plausible and is returned as-is
    """
    if x < 0:
        return abs(x)
    if 0 < x < 30:
        return x * 10
    if 300 < x <= 2000:
        return int(x / 10)
    if x > 2000:
        return int(x / 100)
    # Values in [30, 300] (and 0) are left unchanged.
    return x
# Repair both blood-pressure columns with the heuristic above.
data["Low Blood Pressure"] = data["Low Blood Pressure"].apply(blood_pressure)
data["Low Blood Pressure"].unique()
data["High Blood Pressure"] = data["High Blood Pressure"].apply(blood_pressure)
data["High Blood Pressure"].unique()
data.shape
# Rows where systolic < diastolic: the two readings were entered swapped.
data[data["High Blood Pressure"] < data["Low Blood Pressure"]]
# Snapshot the systolic column (.copy() so the .loc writes below cannot
# mutate it), because the first assignment overwrites it.
high = data['High Blood Pressure'].copy()
swapped = data['High Blood Pressure'] < data['Low Blood Pressure']
# BUG FIX: use .loc instead of chained indexing (data[col][mask] = ...).
# Chained assignment may write to a temporary copy and silently leave
# `data` unchanged (pandas SettingWithCopyWarning).
data.loc[swapped, 'High Blood Pressure'] = data.loc[swapped, 'Low Blood Pressure']
data.loc[swapped, 'Low Blood Pressure'] = high[swapped]
# Clamp readings into a plausible window: systolic at least 120,
# diastolic at most 90.
data.loc[data['High Blood Pressure'] < 120, 'High Blood Pressure'] = 120
data.loc[data['Low Blood Pressure'] > 90, 'Low Blood Pressure'] = 90
#data = data[(data["High Blood Pressure"] >= 120) & (data["Low Blood Pressure"] >0) & (data["Low Blood Pressure"] <=90)]
# Drop rows whose diastolic reading is still non-positive.
data = data[data['Low Blood Pressure'] > 0]
data
# Distribution plots for the cleaned features.
# BUG FIX: removed the stray plt.figure(figsize=(20,10)) calls that were
# interleaved here — they opened empty matplotlib figures and had no
# effect on the plotly charts, which manage their own sizing.
for col in ("Cholesterol", "Glucose", "Gender", "Height", "Weight",
            "High Blood Pressure", "Low Blood Pressure"):
    fig = px.histogram(data, x=col)
    fig.show()
This dataset doesn't have any missing values, but its blood-pressure columns have many problems: some values are far outside the reasonable range. I therefore used pandas' apply function to make them sensible, and set the highest allowed Low Blood Pressure to 90 and the lowest allowed High Blood Pressure to 120, to keep the readings in a reasonable, physiologically plausible range.
# Replace each categorical column with one-hot indicator columns
# (prefixed with the source column name), then drop the originals.
categorical_cols = ['Gender', 'Cholesterol', 'Glucose']
for col in categorical_cols:
    data[col].unique()
    data = data.join(pd.get_dummies(data[col], prefix=col))
data
data = data.drop(categorical_cols, axis=1)
data
# Separate the feature matrix from the target label.
X = data.drop(['Disease'], axis=1)
y = data['Disease']
from sklearn.model_selection import train_test_split
# Hold out a third of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)
# Standardize the features: fit on the training split only, then apply
# the same transform to both splits.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)
In this section I dealt with the categorical data — gender, cholesterol, and glucose — converting them into numerical indicator columns. I then split the data into training and testing sets (67% / 33%). Finally, to balance the influence of all features on the target, I standardized the dataset for modeling.
def plot_roc_auc(labels, predict_prob):
    """Plot the ROC curve for binary predictions/scores and show its AUC."""
    fpr, tpr, _ = roc_curve(labels, predict_prob)
    area = auc(fpr, tpr)
    plt.title('ROC')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.4f' % area)
    plt.legend(loc='lower right')
    # Diagonal reference line: a random classifier.
    plt.plot([0, 1], [0, 1], 'r--')
    plt.ylabel('TPR')
    plt.xlabel('FPR')
    plt.show()
# Baseline Gaussian naive Bayes on the standardized features.
gnb = GaussianNB()
gnb.fit(X_train_std, y_train)
gnb_pred = gnb.predict(X_test_std)
acc = metrics.accuracy_score(y_test, gnb_pred)
print(f"Accuracy: {round(acc*100, 2)}%")
plot_roc_auc(y_test, gnb_pred)
# Multinomial naive Bayes requires non-negative inputs, so it is fit on
# the raw (unstandardized) features instead.
gnb = MultinomialNB()
gnb.fit(X_train, y_train)
gnb_pred = gnb.predict(X_test)
acc = metrics.accuracy_score(y_test, gnb_pred)
print(f"Accuracy: {round(acc*100, 2)}%")
plot_roc_auc(y_test, gnb_pred)
# Sweep the MultinomialNB smoothing parameter alpha over 4000..4900,
# tracking the best accuracy and best AUC seen.
best_gnb_roc_auc = 0
Max_accuracy = 0
for alpha in range(4000, 5000, 100):
    model = MultinomialNB(alpha=alpha)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    fpr, tpr, _ = roc_curve(y_test, preds)
    best_gnb_roc_auc = max(best_gnb_roc_auc, auc(fpr, tpr))
    print(alpha)
    acc = round(metrics.accuracy_score(y_test, preds)*100, 2)
    print(f"Accuracy: {acc}%")
    Max_accuracy = max(Max_accuracy, acc)
print("Best accuracy for Tuning Model is:", Max_accuracy)
print("Best AUC for Tuning Model is:", best_gnb_roc_auc)
First, I tried Gaussian naive Bayes to predict the target, but the data visualizations show that the dataset is not normally distributed. I therefore used multinomial naive Bayes, which is suitable for classification with discrete features. It has three parameters; I tuned alpha, the smoothing parameter, and judging by AUC the best model reaches about 71.26% accuracy.
# Final naive Bayes model with the tuned smoothing parameter (fit on the
# raw features, as in the sweep above).
best_nb = MultinomialNB(alpha=4700)
best_nb.fit(X_train, y_train)
# Baseline KNN on the *unstandardized* features.
classifier = KNeighborsClassifier()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print(f"Accuracy: {round(metrics.accuracy_score(y_test, y_pred)*100, 2)}%")
plot_roc_auc(y_test, y_pred)
# KNN on the standardized features.
# BUG FIX: the original created std_classifier but then fit/predicted with
# `classifier` (the unscaled model), and printed the accuracy of the stale
# `y_pred` instead of `knn_pred` — the printed number never reflected the
# standardized model.
std_classifier = KNeighborsClassifier()
std_classifier.fit(X_train_std, y_train)
knn_pred = std_classifier.predict(X_test_std)
print(f"Accuracy: {round(metrics.accuracy_score(y_test, knn_pred)*100, 2)}%")
plot_roc_auc(y_test, knn_pred)
#cv = ShuffleSplit(n_splits=100, test_size=0.3, random_state=16)
# 10-fold shuffled CV, reused by the later model evaluations as well.
cv = KFold(n_splits=10, shuffle=True, random_state=10)
from sklearn.model_selection import GridSearchCV
# Grid-search KNN over neighborhood size and leaf size on the
# standardized training data.
knn_param_grid = {
    'n_neighbors': range(5, 50, 5),
    'leaf_size': range(5, 50, 5),
}
knn_grid = GridSearchCV(KNeighborsClassifier(), knn_param_grid, cv=cv)
knn_grid.fit(X_train_std, y_train)
best_knn_pred = knn_grid.predict(X_test_std)
plot_roc_auc(y_test, best_knn_pred)
knn_grid.best_params_
I used the data both with and without standardization, and then used grid search to find the best parameters. The AUC of the best model is 0.7264.
# Refit KNN with the grid-search winners for the final test-file run.
best_knn = KNeighborsClassifier(leaf_size=5, n_neighbors=45)
best_knn.fit(X_train_std,y_train)
# Baseline linear SVM (primal formulation, hence dual=False).
lin_clf = svm.LinearSVC(dual=False)
lin_clf
lin_clf.fit(X_train_std,y_train)
lin_clf_pred = lin_clf.predict(X_test_std)
plot_roc_auc(y_test, lin_clf_pred)
# Tune the linear SVM's regularization strength C with grid search,
# cross-validated over 15 random 70/30 shuffle splits.
bs = model_selection.ShuffleSplit(n_splits=15,test_size=0.3,random_state=0)
param_grid = {'C':[0.001,0.01,0.05,0.1,0.25,0.5],'penalty':['l2']}
gridbs = GridSearchCV(lin_clf,param_grid,cv=bs)
gridbs.fit(X_train_std,y_train)
# NOTE(review): this re-runs the entire grid search once per outer CV
# split — very expensive; cross-validating only the best estimator would
# be much cheaper.
acc = cross_val_score(gridbs, X_train_std, y_train, cv=bs).mean()*100
acc
y_pred = gridbs.predict(X_test_std)
# assumes label 0 = "disease" and 1 = "no_disease" — TODO confirm against the data
target_names = ["disease", "no_disease"]
print(classification_report(y_test, y_pred, target_names=target_names))
gridbs.best_params_
plot_roc_auc(y_test,y_pred)
# RBF-kernel SVM baseline.
# BUG FIX: the original fit on the unscaled X_train but cross-validated
# and predicted on the standardized features; fit on X_train_std so the
# training and test inputs come from the same scale.
rbf = SVC(C=0.1, kernel='rbf', gamma=0.1, cache_size=2000)
rbf.fit(X_train_std, y_train)
acc = cross_val_score(rbf, X_train_std, y_train, cv=bs).mean()*100
acc
non_svm_pred = rbf.predict(X_test_std)
plot_roc_auc(y_test, non_svm_pred)
# Grid-search the RBF SVM over C and gamma on the standardized features
# (GridSearchCV's default cross-validation).
param_grid = {'C':[0.1,0.5,0.8,1], 'gamma':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]}
gridrbf = GridSearchCV(SVC(), param_grid)
gridrbf.fit(X_train_std, y_train)
nl_svm = gridrbf.predict(X_test_std)
nl_svm
plot_roc_auc(y_test, nl_svm)
gridrbf.best_params_
C is a penalty value: the higher C is, the less tolerant the model is of misclassification. As for gamma, the greater the gamma, the fewer the support vectors. Using grid search, I found the best parameters for both the linear and the non-linear SVM.
# Refit both SVMs with their grid-search winners for the final test run.
best_lin_svm = svm.LinearSVC(dual=False,C = 0.01, penalty = 'l2')
best_lin_svm.fit(X_train_std, y_train)
best_nonlin_svm = SVC(C=0.8,kernel='rbf',gamma=0.1,cache_size=2000)
best_nonlin_svm.fit(X_train_std, y_train)
# Random forest baseline: 100 trees, 10 candidate features per split.
from sklearn.ensemble import RandomForestClassifier
num_trees = 100
rf = RandomForestClassifier(n_estimators=num_trees, max_features=10, random_state=16,bootstrap=True)
# Cross-validated accuracy on the training split (cv defined above).
results = cross_val_score(rf, X_train_std, y_train, cv=cv)
print(f"Accuracy: {round(results.mean()*100, 2)}%")
rf.fit(X_train_std, y_train)
rf_pred =rf.predict(X_test_std)
plot_roc_auc(y_test, rf_pred)
from sklearn.model_selection import GridSearchCV
# Tune the random forest: number of trees, tree depth, and the number of
# features considered at each split.
rf_param_grid = {
    'n_estimators': range(60, 160, 20),
    'max_depth': range(1, 10),
    'max_features': range(5, 10),
}
rf_grid = GridSearchCV(RandomForestClassifier(), rf_param_grid)
rf_grid.fit(X_train_std, y_train)
best_rf_pred = rf_grid.predict(X_test_std)
plot_roc_auc(y_test, best_rf_pred)
rf_grid.best_params_
n_estimators is the number of trees in the forest, max_depth is the maximum depth of each tree, and max_features is the number of features considered when looking for the best split. I obtained the best parameters via grid search.
# Refit the random forest with the tuned hyperparameters.
best_rf = RandomForestClassifier(n_estimators=120, max_depth = 8,max_features=7, random_state=16,bootstrap=True)
best_rf.fit(X_train_std, y_train)
# Gradient boosting baseline with the same tree count as the forest.
from sklearn.ensemble import GradientBoostingClassifier as gbm
model = gbm(n_estimators=num_trees, random_state=16)
results = cross_val_score(model, X_train_std, y_train, cv=cv)
print(f"Accuracy for GBM: {round(results.mean()*100, 2)}%")
from sklearn.model_selection import GridSearchCV
# Tune gradient boosting: shrinkage, number of boosting stages, features
# per split, and tree depth.
param_grid = {
    'learning_rate': np.arange(0.02, 0.1, 0.01),
    'n_estimators': range(60, 100, 20),
    'max_features': range(1, 10),
    'max_depth': range(2, 5),
}
clf = GridSearchCV(gbm(), param_grid)
clf.fit(X_train_std, y_train)
print(f"Accuracy for best GBM: {round(clf.best_score_*100, 2)}%")
for key, val in clf.best_params_.items():
    print(f"Best hyperparameter is {key}: {val}")
gbm_pred = clf.predict(X_test_std)
plot_roc_auc(y_test, gbm_pred)
As seen from the model predictions above, gradient boosting is the best model I could build. I tuned learning_rate, max_depth, and n_estimators with grid search to optimize it: learning_rate shrinks the contribution of each tree, n_estimators is the number of boosting stages to perform, and max_depth is the maximum depth of the individual regression estimators.
# Final gradient-boosting model with the tuned hyperparameters.
best_gbm = gbm(n_estimators=80,learning_rate = 0.06, max_depth = 4,max_features = 4,random_state=16)
best_gbm.fit(X_train_std, y_train)
# Score the held-out test file with every tuned model and write the
# predictions to CSV.
testing_data = pd.read_csv('./Disease_Prediction_Testing.csv')
testing = pd.read_csv('./Disease_Prediction_Testing.csv')
testing_data
# One-hot encode the categorical columns the same way as the training set.
testing_data = testing_data.join(pd.get_dummies(testing_data.Gender, prefix='Gender'))
testing_data = testing_data.join(pd.get_dummies(testing_data.Cholesterol, prefix='Cholesterol'))
testing_data = testing_data.join(pd.get_dummies(testing_data.Glucose, prefix='Glucose'))
testing_data = testing_data.drop(['ID', 'Gender', 'Cholesterol', 'Glucose'], axis=1)
# BUG FIX: reuse the scaler that was fitted on X_train instead of
# refitting a new one on the test set — refitting leaks test-set
# statistics and shifts the features relative to what the models saw
# during training.
# NOTE(review): this assumes the test file yields the same dummy columns
# in the same order as training — verify testing_data.columns matches
# X.columns.
testing_data_std = scaler.transform(testing_data)
# best_nb was trained on the raw features, so it scores the raw frame.
nb_pred = best_nb.predict(testing_data)
knn_pred = best_knn.predict(testing_data_std)
lin_pred = best_lin_svm.predict(testing_data_std)
non_pred = best_nonlin_svm.predict(testing_data_std)
rf_pred = best_rf.predict(testing_data_std)
gbm_pred = best_gbm.predict(testing_data_std)
result = pd.DataFrame({'ID': testing.ID, 'Naive_Bayes': nb_pred, 'KNN': knn_pred,
                       'Linear_SVM': lin_pred, 'Non_SVM': non_pred,
                       'Random Forest': rf_pred, 'Gradient Boosting': gbm_pred})
# index=False: don't write the meaningless row index as an extra column.
result.to_csv('homework_3_YueyuanHe_results.csv', index=False)
In summary, I explored this dataset and then preprocessed the data for model building. From the six models above I collected predictions and an AUC for each, and I used optimization methods such as grid search to improve their performance. I conclude that gradient boosting is currently the best model for predicting whether a person has the disease, at roughly 73.87% accuracy.